{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#bert-base-NER classify each token as one of the following :\n",
    "\n",
    "# O: Outside of a named entity\n",
    "# B-MISC: Beginning of a miscellaneous entity right after another miscellaneous entity\n",
    "# I-MIS: Miscellaneous entity\n",
    "# B-PER: Beginning of a person's name right after another person's name\n",
    "# I-PER: Person's name\n",
    "# B-ORG: Beginning of an organization right after another organization\n",
    "# I-ORG: organization\n",
    "# B-LOC: Beginning of a location right after another location\n",
    "# I-LOC: Location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "pip install transformers\n",
    "pip install langchain_community"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/codespace/.python/current/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
      "- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
    "from transformers import pipeline\n",
    "\n",
    "model_id = \"dslim/bert-base-NER\"\n",
    "\n",
    "# load the tokenizer from huggingface\n",
    "tokenizer = AutoTokenizer.from_pretrained(\n",
    "    model_id\n",
    ")\n",
    "# load the NER model from huggingface\n",
    "model = AutoModelForTokenClassification.from_pretrained(\n",
    "    model_id\n",
    ")\n",
    "# load the tokenizer and model into a NER pipeline\n",
    "ner = pipeline(\n",
    "    \"ner\",\n",
    "    model=model,\n",
    "    tokenizer=tokenizer,\n",
    "    aggregation_strategy=\"max\",\n",
    "    # device=device\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pdf 1 - Harry Potter and the Death Hallows Summary.pdf\n",
    "\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "loader = PyPDFLoader('../data/Harry Potter and the Death Hallows Summary.pdf')\n",
    "pages = loader.load_and_split()\n",
    "\n",
    "contents=''\n",
    "for i in range(0,len(pages)):\n",
    "    contents=' '.join([contents,pages[i].page_content.replace('\\n',' ')])\n",
    "\n",
    "ner(contents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pdf 2 - About Amazon.pdf\n",
    "\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "loader = PyPDFLoader('../data/About Amazon.pdf')\n",
    "pages = loader.load_and_split()\n",
    "\n",
    "contents=''\n",
    "for i in range(0,len(pages)):\n",
    "    contents=' '.join([contents,pages[i].page_content.replace('\\n',' ')])\n",
    "\n",
    "ner(contents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pdf 3 - Apple stock during pandemic.pdf\n",
    "\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "loader = PyPDFLoader('../data/Apple stock during pandemic.pdf')\n",
    "pages = loader.load_and_split()\n",
    "\n",
    "contents=''\n",
    "for i in range(0,len(pages)):\n",
    "    contents=' '.join([contents,pages[i].page_content.replace('\\n',' ')])\n",
    "\n",
    "ner(contents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pdf 4 - Bank of America Q23.pdf\n",
    "\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "loader = PyPDFLoader('../data/Bank of America Q23.pdf')\n",
    "pages = loader.load_and_split()\n",
    "\n",
    "contents=''\n",
    "for i in range(0,len(pages)):\n",
    "    contents=' '.join([contents,pages[i].page_content.replace('\\n',' ')])\n",
    "\n",
    "ner(contents)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
